import time
time_start_notebook = time.time()
import numpy as np
import pandas as pd
pd.plotting.register_matplotlib_converters() # to plot timeseries
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = 8,8
plt.rcParams.update({'font.size': 16})
plt.style.use('ggplot')
%matplotlib inline
import seaborn as sns
sns.set(color_codes=True)
import plotly
import plotly.offline as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
import plotly.tools as tls
from plotly.subplots import make_subplots
from plotly.offline import plot, iplot, init_notebook_mode
init_notebook_mode(connected=False)
[(x.__name__,x.__version__) for x in [plotly]]
import plotly.express as px
import pycountry
import pycountry_convert as pc
import country_converter as coco
from functools import lru_cache
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;
# Colour palette (hex RGB strings) shared by the plots further down.
CNF = '#393e46' # confirmed - dark grey
DTH = '#ff2e63' # death - red
REC = '#21bf73' # recovered - green
ACT = '#fe9801' # active case - orange
!ls ../data/kaggle
# Load the three Kaggle CSV files from the local data directory.
home = '../data/kaggle/'
dfc = pd.read_csv(home + 'covid_19_clean_complete.csv')
dfu = pd.read_csv(home + 'us_covid19_daily.csv')
dfus = pd.read_csv(home + 'us_states_covid19_daily.csv')
dfc.shape, dfu.shape, dfus.shape

# Peek at the first and last two rows of each frame.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
pd.concat([dfc.head(2), dfc.tail(2)])
pd.concat([dfu.head(2), dfu.tail(2)])
pd.concat([dfus.head(2), dfus.tail(2)])

# Active cases are the confirmed cases that have neither died nor recovered.
# Deaths must be subtracted as well, otherwise the stacked/treemap views that
# show Active, Deaths and Recovered side by side count deaths twice.
dfc['Active'] = dfc['Confirmed'] - dfc['Deaths'] - dfc['Recovered']
dfc.head(2)
# Look at the largest single record and the country-name frequencies, then
# normalise the alternative spelling used for China.
dfc.nlargest(1, 'Confirmed')
dfc['Country/Region'].value_counts()
dfc['Country/Region'] = dfc['Country/Region'].replace('Mainland China', 'China')

# Rows with no province/state get the placeholder 'unknown'.
dfc['Province/State'] = dfc['Province/State'].fillna('unknown')

# Collect the cruise-ship records, which are identified via Province/State.
on_grand_princess = dfc['Province/State'].str.contains('Grand Princess')
on_diamond_princess = dfc['Province/State'].str.contains('Diamond Princess cruise ship')
dfs = dfc[on_grand_princess | on_diamond_princess]
dfs.head(2)
@lru_cache(maxsize=None)
def do_fuzzy_search(country):
    """Return the alpha-2 code of the best pycountry fuzzy match for *country*.

    Results are memoised, so each distinct name is searched only once even
    when the function is applied to a long column.

    Args:
        country: Country name to look up.

    Returns:
        The ``alpha_2`` attribute of the first search result, or ``np.nan``
        when the lookup fails for any reason.
    """
    try:
        result = pycountry.countries.search_fuzzy(country)
        return result[0].alpha_2
    except Exception:
        # Best-effort lookup: unknown names become NaN and are patched by
        # hand later.  ``Exception`` (rather than a bare ``except``) keeps
        # KeyboardInterrupt/SystemExit propagating and matches the sibling
        # lookup helpers in this file.
        return np.nan
# Resolve every country name to a two-letter code.
dfc['country_code'] = dfc['Country/Region'].apply(do_fuzzy_search)
dfc.head()

# Inspect the rows whose name could not be resolved and the names involved.
unresolved = dfc['country_code'].isnull()
dfc[unresolved].shape
dfc[unresolved].head(2)
dfc.loc[unresolved, 'Country/Region'].unique()
# Manual name -> alpha-2 overrides, applied on top of the fuzzy-search result.
bad_names = {
    'Korea, South': 'KR',
    'Taiwan*': 'TW',
    'Congo (Kinshasa)': 'CD',
    'occupied Palestinian territory': 'PS',
    # Republic of the Congo is 'CG'; 'CD' is the DR Congo (Kinshasa) above.
    'Congo (Brazzaville)': 'CG',
    'Taipei and environs': 'TW',
    'Iran (Islamic Republic of)': 'IR',
    'Channel Islands': 'GB',
}

# Apply all overrides in a single vectorised pass: names present in the
# mapping take the manual code, every other row keeps its current code.
manual_codes = dfc['Country/Region'].map(bad_names)
dfc['country_code'] = manual_codes.fillna(dfc['country_code'])

# Names that are still unresolved after the overrides.
dfc.loc[dfc.country_code.isnull(), 'Country/Region'].unique()
@lru_cache(maxsize=None)
def do_continent_search(alpha2):
    """Map an alpha-2 country code to a continent code (memoised).

    Args:
        alpha2: Two-letter country code.

    Returns:
        Whatever ``pc.country_alpha2_to_continent_code`` returns for the
        code, or ``np.nan`` if that call raises.
    """
    try:
        return pc.country_alpha2_to_continent_code(alpha2)
    except Exception:
        return np.nan
# Derive the continent code from the country code.
dfc['continent'] = dfc['country_code'].apply(do_continent_search)

# Vatican City ('VA') is assigned to Europe explicitly.  A boolean-mask
# assignment replaces the former row-by-row apply over the whole frame.
dfc.loc[dfc['country_code'] == 'VA', 'continent'] = 'EU'
dfc.head(2)
@lru_cache(maxsize=None)
def country(alpha2):
    """Convert an alpha-2 code to a short country name (memoised).

    Args:
        alpha2: Two-letter country code.

    Returns:
        The ``name_short`` conversion produced by ``coco.convert``;
        ``np.nan`` is passed as the not-found value and is also returned if
        the conversion raises.
    """
    try:
        return coco.convert(names=alpha2, to='name_short', not_found=np.nan)
    except Exception:
        return np.nan
# Replace the raw country names with the short names derived from the codes.
dfc['Country/Region'] = dfc['country_code'].apply(country)
dfc.head(2)

# Compare the old and the new spelling for South Korea.
dfc.loc[dfc['Country/Region'] == 'Korea, South']
dfc.loc[dfc['Country/Region'] == 'South Korea'].head(2)

# Check the dtypes, then parse the Date column into datetimes.
dfc.dtypes
dfc['Date'] = pd.to_datetime(dfc['Date'])
dfc.head(2)
# Keep only the rows of the most recent date and total them per country.
latest = dfc[dfc['Date'] == dfc['Date'].max()].reset_index()
# numeric_only=True keeps the historical behaviour (text/date columns are
# left out of the sum) and avoids the TypeError pandas 2.x raises when asked
# to sum the datetime column.
latest = latest.groupby('Country/Region').sum(numeric_only=True).reset_index()
print(latest.shape)
# DataFrame.append was removed in pandas 2.0; concat gives the same preview.
pd.concat([latest.head(2), latest.tail(2)])
def plotly_top20_countries(df, col, color):
    """Show a horizontal bar chart of the 20 countries with the largest *col*.

    Args:
        df: Frame with a 'Country/Region' column and the numeric column *col*.
        col: Name of the column to rank and plot.
        color: Colour used for every bar.
    """
    # Pick the top 20 by the requested column, then order them ascending by
    # that same column (previously the re-sort always used 'Confirmed', which
    # scrambled the bars for every other column).
    top20 = (df.sort_values(col, ascending=False)
               .head(20)
               .sort_values(col, ascending=True))
    fig = px.bar(top20,
                 x=col,
                 y="Country/Region",
                 title=f'Total {col} Cases',
                 text=col,
                 orientation='h',
                 width=700,
                 height=700,
                 # Leave 10% head-room for the outside text labels; the range
                 # is taken from the frame passed in, not from a global.
                 range_x=[0, df[col].max() * 1.1])
    fig.update_traces(marker_color=color,
                      opacity=0.8,
                      textposition='outside')
    fig.show()
# One top-20 bar chart per case type, each in its palette colour.
for case_col, case_color in (('Confirmed', CNF),
                             ('Deaths', DTH),
                             ('Recovered', REC),
                             ('Active', ACT)):
    plotly_top20_countries(latest, case_col, case_color)

# Log-log scatter of deaths against confirmed cases for the 20 countries
# with the most deaths.
top20deaths = latest.nlargest(20, 'Deaths')
scatter_options = dict(x='Confirmed',
                       y='Deaths',
                       color='Country/Region',
                       text='Country/Region',
                       log_x=True,
                       log_y=True,
                       height=800,
                       width=600,
                       title='Deaths vs Confirmed')
fig = px.scatter(top20deaths, **scatter_options)
fig.update_traces(textposition='top center')
fig.show()

# Five countries with the most confirmed cases.
top5 = latest.nlargest(5, 'Confirmed')
top5
def plotly_barplot(df, title):
    """Show grouped bars of the four case counts per country.

    Args:
        df: Frame with 'Country/Region', 'Confirmed', 'Active', 'Recovered'
            and 'Deaths' columns.
        title: Figure title.
    """
    # (column, bar colour); the column name doubles as the legend label.
    # The recovered series was previously labelled 'Revovered'.
    series_colors = [
        ('Confirmed', 'rgb(55, 83, 109)'),
        ('Active', 'lightsalmon'),
        ('Recovered', 'green'),
        ('Deaths', 'crimson'),
    ]
    data = [
        go.Bar(name=col,
               x=df["Country/Region"],
               y=df[col],
               marker_color=bar_color)
        for col, bar_color in series_colors
    ]
    fig = go.Figure(data=data)
    fig.update_layout(barmode='group', title_text=title)
    fig.layout.template = 'plotly_dark'
    fig.show()
title = 'Latest Top 5 countries'
plotly_barplot(top5, title)

# Largest five among the countries that still have fewer than 10 cases.
dfx = latest[latest.Confirmed < 10]
dfx = dfx.nlargest(5, 'Confirmed')
dfx
title = 'Latest Top 5 countries with Confirmed < 10'
plotly_barplot(dfx, title)

# Countries whose Confirmed total over all rows exceeds 1000.
# numeric_only=True keeps the sum restricted to numeric columns; without it
# pandas 2.x tries to sum the datetime/text columns and fails.
# NOTE(review): this adds Confirmed across every date; if the column is
# cumulative the threshold is not a plain case count -- confirm intent.
dfx = dfc.groupby('Country/Region').sum(numeric_only=True).reset_index()
unq = dfx.loc[dfx['Confirmed'] > 1000, 'Country/Region'].unique()
unq.shape, unq

# Per-country, per-date totals restricted to those countries.
dfy = dfc.groupby(['Country/Region', 'Date']).sum(numeric_only=True).reset_index()
dfy = dfy[dfy['Country/Region'].isin(unq)]
print(dfy.shape)
dfy.head(2)
gb = dfy.groupby('Country/Region')
def plotly_mulitplots(df, unq, title):
    """Show one 'Confirmed' line chart per country on a three-column grid.

    Args:
        df: Frame with 'Country/Region', 'Date' and 'Confirmed' columns.
        unq: Sequence of country names to plot, one subplot each.  Every
            name must occur in ``df['Country/Region']``.
        title: Figure title.
    """
    names = list(unq)
    n_cols = 3
    # Ceiling division so a country count that is not a multiple of three
    # still gets enough rows (reshape(-1, 3) used to raise in that case).
    n_rows = max(1, -(-len(names) // n_cols))
    fig = make_subplots(rows=n_rows, cols=n_cols, subplot_titles=names)

    # Group the frame that was passed in rather than relying on a global.
    by_country = df.groupby('Country/Region')
    for position, name in enumerate(names):
        grid_row, grid_col = divmod(position, n_cols)
        grp = by_country.get_group(name)
        fig.add_trace(go.Scatter(x=grp['Date'],
                                 y=grp['Confirmed'],
                                 mode='lines',
                                 name=name),
                      row=grid_row + 1,   # plotly grid positions are 1-based
                      col=grid_col + 1)
    fig.update_layout(height=2000,
                      width=1400,
                      title_text=title,
                      showlegend=False)
    fig.layout.template = 'plotly_dark'
    fig.show()
title = "Countries with Confirmed Cases > 1_000"
plotly_mulitplots(dfy, unq, title)

# Totals per continent on the most recent date, largest first.
dfx = dfc[dfc['Date'] == dfc['Date'].max()].reset_index()
# numeric_only=True restricts the sum to numeric columns; pandas 2.x would
# otherwise fail on the datetime column.
dfx = (dfx.groupby('continent')
          .sum(numeric_only=True)
          .reset_index()
          .sort_values('Confirmed', ascending=False))
dfx
def plotly_continent_plot(df, title):
    """Show grouped bars of active, recovered and death counts per continent.

    Args:
        df: Frame with 'continent', 'Active', 'Recovered' and 'Deaths'
            columns.
        title: Figure title.
    """
    # (column, bar colour); the column name doubles as the legend label.
    # The recovered series was previously labelled 'Revovered'.
    series_colors = [
        ('Active', 'rgb(55, 83, 109)'),
        ('Recovered', 'green'),
        ('Deaths', 'crimson'),
    ]
    data = [
        go.Bar(name=col,
               x=df["continent"],
               y=df[col],
               marker_color=bar_color)
        for col, bar_color in series_colors
    ]
    fig = go.Figure(data)
    fig.update_layout(barmode='group', title_text=title)
    fig.layout.template = 'plotly_dark'
    fig.show()
title = 'Total Confirmed Cases per Continent'
plotly_continent_plot(dfx, title)

# Worldwide totals per date.  numeric_only=True restricts the sum to numeric
# columns; pandas 2.x would otherwise try to sum the text columns too.
tmp = dfc.groupby('Date').sum(numeric_only=True).reset_index()

# Confirmed / recovered / deaths over time.  (barmode was dropped: it only
# affects bar traces and did nothing for these line traces.)
fig = go.Figure()
for case in ('Confirmed', 'Recovered', 'Deaths'):
    fig.add_trace(go.Scatter(x=tmp['Date'], y=tmp[case],
                             mode='lines',
                             name=case))
fig.update_layout(title_text='Line plot of Growth')
fig.layout.template = 'plotly_dark'
fig.show()

# Active versus recovered over time.
fig = go.Figure()
for case in ('Active', 'Recovered'):
    fig.add_trace(go.Scatter(x=tmp['Date'], y=tmp[case],
                             mode='lines',
                             name=case))
fig.update_layout(title_text='Covid Patterns')
fig.layout.template = 'plotly_dark'
fig.show()
# Daily totals of the three mutually exclusive case states.
case_cols = ['Recovered', 'Deaths', 'Active']

# Column selection on a GroupBy needs a list; the bare tuple form
# (gb['a', 'b']) was deprecated in pandas 1.0 and removed in 2.0.
dfx = dfc.groupby('Date')[case_cols].sum().reset_index()
dfx.head()

# The same table written with an explicit aggregation spec.
dfx = dfc.groupby('Date')\
         .agg({'Recovered': 'sum',
               'Deaths': 'sum',
               'Active': 'sum'})\
         .reset_index()
dfx.head()

# Long format: one row per (Date, Case) with its Count.
dfx = dfx.melt(id_vars="Date",
               value_vars=case_cols,
               var_name='Case',
               value_name='Count')
dfx.head()
def plotly_area_plot(dfx, title):
    """Show an area chart of 'Count' over 'Date', one colour per 'Case'.

    Args:
        dfx: Long-format frame with 'Date', 'Count' and 'Case' columns.
        title: Figure title.
    """
    area_fig = px.area(dfx, x="Date", y="Count", color='Case')
    area_fig.update_layout(barmode='stack',
                           title_text=title,
                           template='plotly_dark')
    area_fig.show()
title = 'Area plot of Growth of Total Numbers'
plotly_area_plot(dfx, title)

# Worldwide totals per date, reduced to the most recent date.
daily_totals = dfc.groupby('Date').agg({'Confirmed': 'sum',
                                        'Deaths': 'sum',
                                        'Recovered': 'sum',
                                        'Active': 'sum'})
tmp = daily_totals.reset_index()
is_last_day = tmp['Date'] == tmp['Date'].max()
tmp = tmp.loc[is_last_day].reset_index(drop=True)
tmp.style.background_gradient(cmap='Pastel1')

# Long format (default 'variable'/'value' columns) for the treemap.
tm = tmp.melt(id_vars="Date",
              value_vars=['Active', 'Deaths', 'Recovered'])
tm

treemap_options = dict(path=["variable"],
                       values="value",
                       height=400,
                       width=600,
                       color_discrete_sequence=[REC, ACT, DTH])
fig = px.treemap(tm, **treemap_options)
fig.show()
def treemap_countries(dfx, exclude=0):
    """Show two country treemaps: confirmed cases, then reported deaths.

    Each treemap ranks the countries by its own column in descending order
    and leaves out the first *exclude* of them.

    Args:
        dfx: Frame with 'Country/Region', 'Confirmed' and 'Deaths' columns.
        exclude: Number of top-ranked countries to drop from each treemap.
    """
    panels = (('Confirmed', 'Number of Confirmed Cases'),
              ('Deaths', 'Number of Deaths reported'))

    # Rank for both panels up front, before anything is drawn.
    ranked_frames = []
    for value_col, _ in panels:
        ordered = dfx.sort_values(by=value_col, ascending=False)
        ranked_frames.append(ordered.reset_index(drop=True).iloc[exclude:, :])

    for (value_col, heading), ranked in zip(panels, ranked_frames):
        fig = px.treemap(ranked,
                         path=["Country/Region"],
                         values=value_col,
                         height=700,
                         title=heading,
                         color_discrete_sequence=px.colors.qualitative.Prism)
        fig.data[0].textinfo = 'label+text+value'
        fig.show()
# Treemaps with every country, then again without the seven largest.
for skipped in (0, 7):
    treemap_countries(latest, exclude=skipped)

# Ten countries with the largest Confirmed sum over all rows.
confirmed_by_country = dfc.groupby('Country/Region')['Confirmed'].sum()
top10 = confirmed_by_country.nlargest(10).index.tolist()
top10

# Per-date confirmed totals for those ten countries, largest values first.
tmp = (dfc.groupby(['Date', 'Country/Region'])
          .agg({'Confirmed': "sum"})
          .reset_index()
          .sort_values('Confirmed', ascending=False))
tmp = tmp[tmp['Country/Region'].isin(top10)]
tmp.head(2)

fig = px.line(tmp, x="Date", y="Confirmed", color="Country/Region")
fig.update_layout(title_text='Country-Wise Covid Distribution',
                  template='plotly_dark')
fig.show()
# Per-continent totals as a colour-graded table.  numeric_only=True keeps the
# sum to numeric columns; pandas 2.x would otherwise fail on the datetime
# column.
tmp = dfc.groupby('continent').sum(numeric_only=True)
tmp[["Confirmed", "Deaths", "Recovered", "Active"]].style.background_gradient(cmap='Reds')

# Confirmed cases per continent over time.
dfx = dfc.groupby(['Date', 'continent'])\
         .agg({'Confirmed': "sum"})\
         .reset_index()
fig = px.line(dfx,
              x="Date",
              y="Confirmed",
              color="continent")
title = 'Total Continent-Wise Covid Distribution'
fig.update_layout(title_text=title)
fig.layout.template = 'plotly_dark'
fig.show()
latest.head(2)
# For every date, count the distinct Country/Region values that have a
# non-zero confirmed count.
has_cases = dfc['Confirmed'] != 0
regions_per_day = dfc[has_cases].groupby('Date')['Country/Region'].unique()
spread = regions_per_day.apply(len).to_frame().reset_index()
spread.head()

line_options = dict(
    x='Date',
    y='Country/Region',
    text='Country/Region',
    title='Number of Countries/Regions to which COVID-19 spread over the time',
    color_discrete_sequence=[CNF, DTH, REC],
)
fig = px.line(spread, **line_options)
fig.update_traces(textposition='top center')
fig.show()
# Latest per-country figures used by the map plots below.
snapshot = dfc[dfc['Date'] == dfc['Date'].max()]
# Case counts are added up across a country's rows, but coordinates must not
# be: summing Lat/Long over several provinces yields values far outside the
# valid range, so the rows' mean position is used instead.
map_aggregation = {
    'Confirmed': 'sum',
    'Deaths': 'sum',
    'Recovered': 'sum',
    'Active': 'sum',
    'Lat': 'mean',
    'Long': 'mean',
}
tmp = snapshot.groupby('Country/Region').agg(map_aggregation).reset_index()
print(tmp.shape)
tmp.head(2)
def plotly_mapplot(dfx, color_col, range_color=None,
                   colorscale='Plasma'):
    """Show a world choropleth of *color_col*, located by country name.

    Args:
        dfx: Frame with a 'Country/Region' column and the column *color_col*.
        color_col: Name of the column that drives the colour of each country.
        range_color: Optional [min, max] pair for the colour axis.
        colorscale: Name of the continuous colour scale.
    """
    # Plot the frame that was passed in; the body used to read the global
    # ``tmp`` and silently ignored its first argument.
    fig = px.choropleth(dfx,
                        locations="Country/Region",
                        color=color_col,
                        locationmode='country names',
                        hover_name="Country/Region",
                        range_color=range_color,
                        title=f'Latest Total {color_col} Cases',
                        color_continuous_scale=colorscale)
    fig.update(layout_coloraxis_showscale=True)
    fig.show()
# Choropleths for the three case types, each with its own colour range.
plotly_mapplot(tmp, 'Confirmed', range_color=[1, 7_000])
plotly_mapplot(tmp, 'Deaths', range_color=[1, 7], colorscale='Earth')
plotly_mapplot(tmp, 'Recovered', range_color=[1, 7], colorscale='Portland')
tmp.head(2)

import folium

# World map with one circle per row of ``tmp``; the radius grows with the
# confirmed count and the tooltip lists the row's figures.
m = folium.Map(location=[0, 0], tiles='cartodbpositron',
               min_zoom=1, max_zoom=4, zoom_start=1)
for _, record in tmp.iterrows():
    tooltip_html = ''.join([
        '<li><bold>Country : ', str(record['Country/Region']),
        '<li><bold>Confirmed : ', str(record['Confirmed']),
        '<li><bold>Deaths : ', str(record['Deaths']),
        '<li><bold>Recovered : ', str(record['Recovered']),
    ])
    circle = folium.Circle(
        location=[record['Lat'], record['Long']],
        color='crimson',
        tooltip=tooltip_html,
        radius=int(record['Confirmed'])**1.1)
    circle.add_to(m)
m
# Per-date, per-country maxima feeding the animated map.
peak_columns = ('Confirmed', 'Deaths', 'Recovered')
tmp = dfc.groupby(['Date', 'Country/Region']).agg(
    {column: 'max' for column in peak_columns})
tmp = tmp.reset_index()

# Marker size is a damped function of the confirmed count; the date becomes
# a text label used as the animation frame.
tmp['size'] = tmp['Confirmed'] ** 0.3
tmp['Date'] = pd.to_datetime(tmp['Date']).dt.strftime('%m/%d/%Y')
tmp.head()

geo_options = dict(locations="Country/Region",
                   locationmode='country names',
                   color="Confirmed",
                   size='size',
                   hover_name="Country/Region",
                   range_color=[0, max(tmp['Confirmed']) + 2],
                   projection="natural earth",
                   animation_frame="Date",
                   title='Spread over time')
fig = px.scatter_geo(tmp, **geo_options)
fig.update_layout(coloraxis_showscale=False)
fig.show()

# Embed the externally hosted bar-chart race.
from IPython.display import HTML
HTML('''<div class="flourish-embed flourish-bar-chart-race" data-src="visualisation/1571387"><script src="https://public.flourish.studio/resources/embed.js"></script></div>''')
# Report the wall-clock time since the start-of-notebook timestamp.
time_taken = time.time() - time_start_notebook
# Distinct names are used for the parts: unpacking into ``h, m`` rebound
# ``m``, which holds the folium map created earlier.
hours, remainder = divmod(time_taken, 60 * 60)
minutes, seconds = divmod(remainder, 60)
print('Time taken: {:.0f} hr {:.0f} min {:.0f} secs'.format(hours, minutes, seconds))